In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white", palette=sns.color_palette("RdBu"))
import numpy as np
import pandas as pd
import scipy.io as sio
from scipy import stats
import sys
sys.path.append('..')
from helper import anomaly
from sklearn.cross_validation import train_test_split
You want to divide the data into 3 sets: training, validation, and test.
You shouldn't make predictions on the training or validation data, as the original exercise does.
In [2]:
# Load the first anomaly-detection dataset from the MATLAB file.
mat = sio.loadmat('./data/ex8data1.mat')
# Inspect which arrays the .mat file contains (X, Xval, yval are used below).
mat.keys()
Out[2]:
In [3]:
X = mat.get('X')
Divide the original validation data into separate validation and test sets.
In [4]:
# Split the labeled validation data 50/50 into a validation set (for
# threshold selection) and a held-out test set (for final prediction).
# FIX: pin random_state so the split — and every downstream result —
# is reproducible under Restart & Run All.
Xval, Xtest, yval, ytest = train_test_split(mat.get('Xval'),
                                            mat.get('yval').ravel(),
                                            test_size=0.5,
                                            random_state=0)
Visualize training data
In [5]:
# Scatter plot of the raw training data (regression line disabled).
train_df = pd.DataFrame(X, columns=['Latency', 'Throughput'])
sns.regplot('Latency', 'Throughput',
            data=train_df,
            fit_reg=False,
            scatter_kws={'s': 20, 'alpha': 0.5})
Out[5]:
In [6]:
# Estimate the parameters of a multivariate Gaussian from the training
# data: per-feature mean vector and the full covariance matrix.
mu = np.mean(X, axis=0)
print(mu, '\n')
# rowvar=False treats rows as observations (equivalent to np.cov(X.T)).
cov = np.cov(X, rowvar=False)
print(cov)
In [7]:
# Example: build an (n, n, 2) array of 2-D coordinate pairs, the shape
# needed later to evaluate a probability density over a mesh.
np.stack(np.mgrid[0:3, 0:3], axis=-1)
Out[7]:
In [8]:
# Build the multivariate Gaussian model from the fitted mu/cov and overlay
# the training points on its probability-density contours.
multi_normal = stats.multivariate_normal(mu, cov)

# Dense coordinate grid covering the data range.
x, y = np.mgrid[0:30:0.01, 0:30:0.01]
pos = np.dstack((x, y))

fig, ax = plt.subplots()

# Shade the probability density over the grid.
ax.contourf(x, y, multi_normal.pdf(pos), cmap='Blues')

# Scatter the original training points on the same axes.
train_df = pd.DataFrame(X, columns=['Latency', 'Throughput'])
sns.regplot('Latency', 'Throughput',
            data=train_df,
            fit_reg=False,
            ax=ax,
            scatter_kws={'s': 10, 'alpha': 0.4})
Out[8]:
In [9]:
# Choose the anomaly threshold epsilon using the labeled validation set.
# NOTE(review): presumably searches epsilon values and returns the one
# maximizing the F-score on (Xval, yval) — confirm in helper/anomaly.py.
e, fs = anomaly.select_threshold(X, Xval, yval)
print('Best epsilon: {}\nBest F-score on validation data: {}'.format(e, fs))
In [10]:
multi_normal, y_pred = anomaly.predict(X, Xval, e, Xtest, ytest)
In [11]:
# construct test DataFrame
data = pd.DataFrame(Xtest, columns=['Latency', 'Throughput'])
data['y_pred'] = y_pred
# create a grid for graphing
x, y = np.mgrid[0:30:0.01, 0:30:0.01]
pos = np.dstack((x, y))
fig, ax = plt.subplots()
# plot probability density
ax.contourf(x, y, multi_normal.pdf(pos), cmap='Blues')
# plot original Xval points
sns.regplot('Latency', 'Throughput',
data=data,
fit_reg=False,
ax=ax,
scatter_kws={"s":10,
"alpha":0.4})
# mark the predicted anamoly of CV data. We should have a test set for this...
anamoly_data = data[data['y_pred']==1]
ax.scatter(anamoly_data['Latency'], anamoly_data['Throughput'], marker='x', s=50)
Out[11]:
In [12]:
mat = sio.loadmat('./data/ex8data2.mat')
In [13]:
# Training features for the second dataset.
X = mat.get('X')
# Split the labeled validation data 50/50 into validation and test sets.
# FIX: pin random_state so the split — and the anomaly count reported
# below — is reproducible under Restart & Run All.
Xval, Xtest, yval, ytest = train_test_split(mat.get('Xval'),
                                            mat.get('yval').ravel(),
                                            test_size=0.5,
                                            random_state=0)
In [14]:
# Re-select epsilon for the second dataset on its validation split.
# NOTE(review): same helper as above — see helper/anomaly.py for semantics.
e, fs = anomaly.select_threshold(X, Xval, yval)
print('Best epsilon: {}\nBest F-score on validation data: {}'.format(e, fs))
In [15]:
multi_normal, y_pred = anomaly.predict(X, Xval, e, Xtest, ytest)
In [16]:
print('find {} anamolies'.format(y_pred.sum()))
The large difference between my result and the official 117 anomalies in ex8 is due to the random validation/test split performed above (only half of the labeled data is used for prediction here).
In [ ]: